import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import re
/Users/lettyuy/opt/anaconda3/lib/python3.9/site-packages/scipy/__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.0
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
#Load the Hot 100 export and convert both date columns to real datetimes
df = pd.read_csv("Hot 100.csv")
df = df.assign(
    chart_date=lambda frame: pd.to_datetime(frame['chart_date']),
    chart_debut=lambda frame: pd.to_datetime(frame['chart_debut']),
)
#Calendar year of each chart date, used later for the per-year grouping
df = df.assign(chart_year=df['chart_date'].dt.year)
df.head()
| chart_position | chart_date | song | performer | song_id | instance | time_on_chart | consecutive_weeks | previous_week | peak_position | worst_position | chart_debut | chart_url | chart_year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 84 | 1990-05-05 | "B" Girls | Young And Restless | "B" GirlsYoung And Restless | 1.0 | 1 | NaN | NaN | 84 | 84 | 1990-05-05 | https://www.billboard.com/charts/hot-100/1990-... | 1990 |
| 1 | 78 | 1990-05-12 | "B" Girls | Young And Restless | "B" GirlsYoung And Restless | 1.0 | 2 | 1.0 | 84.0 | 78 | 84 | 1990-05-05 | https://www.billboard.com/charts/hot-100/1990-... | 1990 |
| 2 | 68 | 1990-05-19 | "B" Girls | Young And Restless | "B" GirlsYoung And Restless | 1.0 | 3 | 2.0 | 78.0 | 68 | 84 | 1990-05-05 | https://www.billboard.com/charts/hot-100/1990-... | 1990 |
| 3 | 60 | 1990-05-26 | "B" Girls | Young And Restless | "B" GirlsYoung And Restless | 1.0 | 4 | 3.0 | 68.0 | 60 | 84 | 1990-05-05 | https://www.billboard.com/charts/hot-100/1990-... | 1990 |
| 4 | 58 | 1990-06-02 | "B" Girls | Young And Restless | "B" GirlsYoung And Restless | 1.0 | 5 | 4.0 | 60.0 | 58 | 84 | 1990-05-05 | https://www.billboard.com/charts/hot-100/1990-... | 1990 |
#Split each performer credit into its individual artists, one row per artist
#NOTE(review): these separators also occur inside some act names (e.g. "Earth, Wind & Fire"),
#which are split as well - confirm that is acceptable for the analysis.
collaboration_separator = re.compile(r',|&| and | featuring | feat\. | ft\. ', flags=re.IGNORECASE)
df['individual_artist'] = df['performer'].apply(collaboration_separator.split)
df = df.explode('individual_artist')
df['individual_artist'] = df['individual_artist'].str.strip()
#Adjacent separators (e.g. ", &") leave blank tokens behind; drop them so that no
#artist named '' is counted in the later groupings
df = df[df['individual_artist'] != '']
#Average chart position of every song + individual artist pair, rounded to a whole number
avg_chart_positions = (
    df.groupby(['song', 'individual_artist'])['chart_position']
    .mean()
    .round()
    .astype(int)
    .reset_index(name='avg_chart_position')
)
#Attach the average back onto every row of its song + artist pair
df = df.merge(avg_chart_positions, on=['song', 'individual_artist'], how='left')
#Keep only the rows where the song sat at chart position 1
df_at_1 = df[df['chart_position'] == 1]
#One row per (artist, song) pair that reached #1; 'count' is the number of #1 rows for the pair
unique_songs_at_1 = (
    df_at_1.groupby(['individual_artist', 'song'])
    .size()
    .reset_index(name='count')
)
#Number of distinct #1 songs credited to each artist
individual_artist_hits = unique_songs_at_1.groupby('individual_artist').size()
#Artists with exactly one #1 song, and one #1 row for each of their song + artist pairs
one_hit_artists_list = individual_artist_hits[individual_artist_hits.eq(1)].index.tolist()
is_one_hit_artist = df_at_1['individual_artist'].isin(one_hit_artists_list)
df_one_hit_wonders = df_at_1[is_one_hit_artist].drop_duplicates(subset=['song', 'individual_artist'])
#Artists with three or more #1 songs, reduced the same way
artists_with_staying_power_list = individual_artist_hits[individual_artist_hits.ge(3)].index.tolist()
has_staying_power = df_at_1['individual_artist'].isin(artists_with_staying_power_list)
df_artists_with_staying_power = df_at_1[has_staying_power].drop_duplicates(subset=['song', 'individual_artist'])
#Helper applied to each yearly group to keep only its top rows
def get_top_10_per_year(group, n=10, column='consecutive_weeks'):
    """Return the rows of ``group`` with the largest values in ``column``.

    Args:
        group: DataFrame holding the rows of a single group (one chart year
            when used through ``groupby('chart_year').apply``).
        n: Maximum number of rows to keep. Defaults to 10.
        column: Name of the column to rank by. Defaults to
            ``'consecutive_weeks'``.

    Returns:
        A DataFrame with at most ``n`` rows, ordered by ``column`` from
        largest to smallest.
    """
    return group.nlargest(n, column)
#Label each group under the Source column so they can be told apart once combined.
#assign() returns new frames, so the frames derived from filtered slices are never
#written to in place (plain column assignment on them can raise SettingWithCopyWarning).
df_one_hit_wonders = df_one_hit_wonders.assign(Source='One Hit Wonders')
df_artists_with_staying_power = df_artists_with_staying_power.assign(Source='Artists with Staying Power')
#Group each data set by year and keep the rows selected by get_top_10_per_year
top_10_one_hit_wonders_yearly = (
    df_one_hit_wonders.groupby('chart_year')
    .apply(get_top_10_per_year)
    .reset_index(drop=True)
)
top_10_staying_power_yearly = (
    df_artists_with_staying_power.groupby('chart_year')
    .apply(get_top_10_per_year)
    .reset_index(drop=True)
)
#Combine the two data sets into one
top_10_combined_yearly = pd.concat([top_10_one_hit_wonders_yearly, top_10_staying_power_yearly])
#Clean up column names for display
top_10_combined_yearly.rename(
    columns={
        'time_on_chart': 'Time on Chart',
        'avg_chart_position': 'Average Chart Position',
        'song': 'Song',
    },
    inplace=True,
)
#Store the years and sources in variables.
#unique() keeps order of appearance, so a year that only exists in the second data set
#would otherwise land after all the others; sort to keep the years chronological.
all_years = np.sort(top_10_combined_yearly['chart_year'].unique())
all_sources = ["One Hit Wonders", "Artists with Staying Power"]
#Make sure every year/source combination has at least one row before plotting
expanded_data = []
for year in all_years:
    rows_for_year = top_10_combined_yearly[top_10_combined_yearly['chart_year'] == year]
    for Source in all_sources:
        subset = rows_for_year[rows_for_year['Source'] == Source]
        if not subset.empty:
            #Real data exists for this pair: carry its rows over unchanged
            expanded_data.extend(subset.to_dict('records'))
            continue
        #Nothing for this pair: add a single placeholder row with NaN coordinates
        placeholder_row = {
            'chart_year': year,
            'Source': Source,
            'Average Chart Position': np.nan,
            'Time on Chart': np.nan,
            'individual_artist': f'Placeholder {Source} {year}'
        }
        expanded_data.append(placeholder_row)
expanded_df = pd.DataFrame(expanded_data)
expanded_df.head()
| chart_position | chart_date | Song | performer | song_id | instance | Time on Chart | consecutive_weeks | previous_week | peak_position | worst_position | chart_debut | chart_url | chart_year | individual_artist | Average Chart Position | Source | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1958-11-29 | To Know Him, Is To Love Him | The Teddy Bears | To Know Him, Is To Love HimThe Teddy Bears | 1.0 | 11.0 | 10.0 | 3.0 | 1.0 | 88.0 | 1958-09-20 | https://www.billboard.com/charts/hot-100/1958-... | 1958 | The Teddy Bears | 24.0 | One Hit Wonders |
| 1 | 1.0 | 1958-11-08 | It's Only Make Believe | Conway Twitty | It's Only Make BelieveConway Twitty | 1.0 | 9.0 | 8.0 | 2.0 | 1.0 | 65.0 | 1958-09-13 | https://www.billboard.com/charts/hot-100/1958-... | 1958 | Conway Twitty | 22.0 | One Hit Wonders |
| 2 | 1.0 | 1958-11-15 | Tom Dooley | The Kingston Trio | Tom DooleyThe Kingston Trio | 1.0 | 8.0 | 7.0 | 2.0 | 1.0 | 83.0 | 1958-09-27 | https://www.billboard.com/charts/hot-100/1958-... | 1958 | The Kingston Trio | 19.0 | One Hit Wonders |
| 3 | 1.0 | 1958-09-27 | It's All In The Game | Tommy Edwards | It's All In The GameTommy Edwards | 1.0 | 7.0 | 6.0 | 3.0 | 1.0 | 96.0 | 1958-08-16 | https://www.billboard.com/charts/hot-100/1958-... | 1958 | Tommy Edwards | 19.0 | One Hit Wonders |
| 4 | 1.0 | 1958-08-23 | Little Star | The Elegants | Little StarThe Elegants | 1.0 | 4.0 | 3.0 | 2.0 | 1.0 | 18.0 | 1958-08-02 | https://www.billboard.com/charts/hot-100/1958-... | 1958 | The Elegants | 18.0 | One Hit Wonders |
#Visualize data!
#Axis extents come from the combined top-10 data and are reused by the figure and the guide lines
avg_position_max = top_10_combined_yearly['Average Chart Position'].max()
avg_position_min = top_10_combined_yearly['Average Chart Position'].min()
time_on_chart_max = top_10_combined_yearly['Time on Chart'].max()

fig = px.scatter(
    expanded_df,
    x="Average Chart Position",
    y="Time on Chart",
    animation_frame="chart_year",
    animation_group="individual_artist",
    hover_name="individual_artist",
    hover_data={"Song": True, "Source": False, "chart_year": False},
    color="Source",
    #NOTE(review): size_max only matters when a `size` column is given; none is set here
    size_max=55,
    #Descending range: numerically lower (better) chart positions are drawn on the right
    range_x=[avg_position_max, avg_position_min],
    range_y=[0, time_on_chart_max],
)

#Dashed guide lines that split the plot into four quadrants
x_mid = 30
y_mid = 30
quadrant_line_style = dict(color="Black", dash="dash", width=0.5)
fig.add_shape(
    type="line",
    x0=x_mid,
    x1=x_mid,
    y0=0,
    y1=time_on_chart_max,
    line=quadrant_line_style,
)
fig.add_shape(
    type="line",
    x0=avg_position_max,
    x1=avg_position_min,
    y0=y_mid,
    y1=y_mid,
    line=quadrant_line_style,
)

#Author credit in the top-right corner of the plotting area (added once; it was previously added twice)
fig.add_annotation(
    text="Letty Uy - CIS 9655",
    xref="paper",
    yref="paper",
    x=1,
    y=1,
    showarrow=False,
    font=dict(size=12, color="black"),
)

#One caption per quadrant, centred halfway between the guide line and the axis origin side
quadrant_captions = [
    ("Lower average rank, long duration", x_mid + (x_mid / 2), y_mid + (y_mid / 2)),
    ("Higher average rank, long duration", x_mid - (x_mid / 2), y_mid + (y_mid / 2)),
    ("Higher average rank, short duration", x_mid - (x_mid / 2), y_mid - (y_mid / 2)),
    ("Lower average rank, short duration", x_mid + (x_mid / 2), y_mid - (y_mid / 2)),
]
for caption, caption_x, caption_y in quadrant_captions:
    fig.add_annotation(text=caption, x=caption_x, y=caption_y, showarrow=False)

#The y axis plots the 'Time on Chart' column, so label it as such
#(the previous title, "Total Weeks at #1", described a quantity that is not plotted)
fig.update_layout(
    xaxis_title="Average Chart Position",
    yaxis_title="Time on Chart (weeks)",
)
fig.show()